Gensim is a popular open-source NLP library. It's used to perform tasks such as building token dictionaries and bag-of-words corpora from raw text.


In [2]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize


C:\anaconda\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [3]:
# Toy corpus: six short movie-related sentences to vectorize below.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

In [6]:
# Lower-case each document and split it into word/punctuation tokens,
# then build a gensim Dictionary mapping each unique token to an integer id.
tokenized_docs = [word_tokenize(document.lower()) for document in my_documents]
dictionary = Dictionary(tokenized_docs)

In [11]:
# Mapping from each token string to the integer id assigned by the Dictionary
dictionary.token2id


Out[11]:
{'!': 12,
 ',': 16,
 '.': 8,
 'a': 4,
 'about': 3,
 'action': 14,
 'alien': 22,
 'aliens': 7,
 'and': 6,
 'awesome': 13,
 'awful': 20,
 'boring': 18,
 'but': 17,
 'characters': 19,
 'cool': 26,
 'films': 23,
 'hate': 21,
 'i': 9,
 'is': 25,
 'liked': 11,
 'more': 27,
 'movie': 1,
 'please': 28,
 'really': 10,
 'scenes': 15,
 'space': 24,
 'spaceship': 5,
 'the': 0,
 'was': 2}

In [12]:
# Reverse lookup: the token string for id 9 (inverse of token2id)
dictionary[9]


Out[12]:
'i'

In [9]:
# Bag-of-words corpus: each document becomes a list of
# (token_id, token_count) pairs via doc2bow.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
corpus


Out[9]:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(0, 1), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (1, 1),
  (2, 1),
  (8, 1),
  (9, 1),
  (12, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(0, 1), (1, 1), (8, 1), (9, 1), (11, 1), (12, 1), (24, 1), (25, 1), (26, 1)],
 [(12, 1), (16, 1), (23, 1), (24, 1), (27, 1), (28, 1)]]

In [14]:
# Take the fifth document's bag-of-words and sort it by token count,
# most frequent token first.
doc = corpus[4]
bow_doc = sorted(doc, key=lambda pair: pair[1], reverse=True)
bow_doc


Out[14]:
[(0, 1), (1, 1), (8, 1), (9, 1), (11, 1), (12, 1), (24, 1), (25, 1), (26, 1)]

In [16]:
# Show the five most frequent tokens of this document with their counts.
for token_id, count in bow_doc[:5]:
    print(dictionary.get(token_id), count)


the 1
movie 1
. 1
i 1
liked 1

In [21]:
# Aggregate token frequencies across every document in the corpus.
from collections import defaultdict
from itertools import chain

totalfreq = defaultdict(int)
# chain.from_iterable flattens the per-document (id, count) lists
# into one stream so we can sum counts per token id.
for token_id, count in chain.from_iterable(corpus):
    totalfreq[token_id] += count

In [23]:
# Rank token ids by their total corpus frequency, highest first.
sorted_freq = sorted(totalfreq.items(), key=lambda item: item[1], reverse=True)

In [24]:
# Show the five most frequent tokens across the whole corpus.
for token_id, total in sorted_freq[:5]:
    print(dictionary.get(token_id), total)


the 4
movie 4
. 4
! 4
i 3